{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/language-detection/language-detection-data-v5.json.zip\n",
    "# !unzip language-detection-data-v5.json.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# JSON text is UTF-8 by specification; pin the encoding so loading does not depend\n",
    "# on the platform's locale default. `lang` is used below through its parallel\n",
    "# 'text' and 'label' lists.\n",
    "with open('language-detection-data-v5.json', encoding='utf-8') as fopen:\n",
    "    lang = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "331597"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Catch-all class: every sentence whose label is none of 'zlm', 'ind' or 'eng'.\n",
    "others = [\n",
    "    sentence\n",
    "    for idx, sentence in enumerate(lang['text'])\n",
    "    if lang['label'][idx] not in ('zlm', 'ind', 'eng')\n",
    "]\n",
    "len(others)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50000"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sentences labelled 'eng'.\n",
    "eng = [\n",
    "    sentence\n",
    "    for idx, sentence in enumerate(lang['text'])\n",
    "    if lang['label'][idx] == 'eng'\n",
    "]\n",
    "len(eng)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "53692"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sentences labelled 'zlm' (written out under the 'malay' key at the end).\n",
    "zlm = [\n",
    "    sentence\n",
    "    for idx, sentence in enumerate(lang['text'])\n",
    "    if lang['label'][idx] == 'zlm'\n",
    "]\n",
    "len(zlm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unique whitespace-separated tokens of the 'zlm' sentences (no cleaning applied).\n",
    "zlm_words = {word for sentence in zlm for word in sentence.split()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "57327"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sentences labelled 'ind'.\n",
    "ind = [\n",
    "    sentence\n",
    "    for idx, sentence in enumerate(lang['text'])\n",
    "    if lang['label'][idx] == 'ind'\n",
    "]\n",
    "len(ind)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unique whitespace-separated tokens of the 'ind' sentences (no cleaning applied).\n",
    "ind_words = {word for sentence in ind for word in sentence.split()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(17286, 7614)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vocabulary sizes of the labelled 'zlm' and 'ind' sentences.\n",
    "len(zlm_words), len(ind_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/200k-english-malay/200k-english-malay.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "150125"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parallel pairs loaded from JSON; only the second element of each pair is kept\n",
    "# (presumably the Malay side -- confirm against the dataset).\n",
    "# The encoding is pinned so loading does not depend on the platform's locale default.\n",
    "with open('200k-english-malay.json', encoding='utf-8') as fopen:\n",
    "    english_malay = json.load(fopen)\n",
    "\n",
    "malays = {pair[1] for pair in english_malay}\n",
    "len(malays)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ms.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tab-separated dictionary file: keep the second column of every non-empty line\n",
    "# whose two columns differ. The encoding is pinned so reading does not depend on\n",
    "# the platform's locale default.\n",
    "with open('en-ms.txt', encoding='utf-8') as fopen:\n",
    "    en_ms = [line.split('\\t') for line in fopen.read().split('\\n') if line]\n",
    "\n",
    "en_ms = {pair[1] for pair in en_ms if pair[1] != pair[0]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the dictionary entries into the Malay word set.\n",
    "malays = malays.union(en_ms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "393429"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One entry per '\\n'-separated line of the file. The encoding is pinned so that\n",
    "# non-ASCII text is decoded the same way on every platform.\n",
    "with open('id-wiki.txt', encoding='utf-8') as fopen:\n",
    "    id_wiki = fopen.read().split('\\n')\n",
    "\n",
    "len(id_wiki)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from unidecode import unidecode\n",
    "import cleaning\n",
    "from tqdm import tqdm\n",
    "\n",
    "def preprocessing(string):\n",
    "    string = re.sub(\n",
    "        'http\\S+|www.\\S+',\n",
    "        '',\n",
    "        ' '.join(\n",
    "            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]\n",
    "        ),\n",
    "    )\n",
    "    \n",
    "    chars = ',.()!:\\'\"/;=-'\n",
    "    for c in chars:\n",
    "        string = string.replace(c, f' {c} ')\n",
    "        \n",
    "    string = re.sub(\n",
    "        u'[0-9!@#$%^&*()_\\-+{}|\\~`\\'\";:?/.>,<]',\n",
    "        ' ',\n",
    "        string,\n",
    "        flags = re.UNICODE,\n",
    "    )\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    \n",
    "    return string.lower()\n",
    "\n",
    "def loop(strings):\n",
    "    \"\"\"Run `preprocessing` over every element of `strings` in place, with a progress bar, and return the same list.\"\"\"\n",
    "    for position, raw_text in enumerate(tqdm(strings)):\n",
    "        strings[position] = preprocessing(raw_text)\n",
    "    return strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 24589/24589 [00:01<00:00, 14574.65it/s]\n",
      "100%|██████████| 5/5 [00:00<00:00, 5830.28it/s]24it/s]s]\n",
      "100%|██████████| 24589/24589 [00:02<00:00, 10878.90it/s]\n",
      "100%|██████████| 24589/24589 [00:02<00:00, 10303.26it/s]\n",
      "100%|██████████| 24589/24589 [00:02<00:00, 10148.64it/s]\n",
      "100%|██████████| 24589/24589 [00:06<00:00, 3733.98it/s]\n",
      "100%|██████████| 24589/24589 [00:07<00:00, 3429.41it/s]\n",
      "100%|██████████| 24589/24589 [00:08<00:00, 2744.80it/s]\n",
      "100%|██████████| 24589/24589 [00:08<00:00, 2843.67it/s]\n",
      "100%|██████████| 24589/24589 [00:08<00:00, 2757.06it/s]\n",
      "100%|██████████| 24589/24589 [00:07<00:00, 3160.58it/s]\n",
      "100%|██████████| 24589/24589 [00:08<00:00, 2941.28it/s]\n",
      "100%|██████████| 24589/24589 [00:10<00:00, 2457.34it/s]\n",
      "100%|██████████| 24589/24589 [00:09<00:00, 2701.60it/s]\n",
      "100%|██████████| 24589/24589 [00:09<00:00, 2618.47it/s]\n",
      " 50%|████▉     | 12229/24589 [00:11<00:07, 1563.74it/s]\n",
      "100%|██████████| 24589/24589 [00:17<00:00, 1373.61it/s]\n"
     ]
    }
   ],
   "source": [
    "# Apply `loop` (and therefore `preprocessing`) to the id-wiki lines through the\n",
    "# project-local `cleaning.multiprocessing` helper; the raw lines are replaced by the\n",
    "# result. NOTE(review): the helper presumably fans the list out to worker\n",
    "# processes -- confirm in cleaning.py.\n",
    "id_wiki = cleaning.multiprocessing(id_wiki, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1624903"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Unique whitespace-separated tokens of the cleaned id-wiki lines.\n",
    "id_wiki_words = {word for line in id_wiki for word in line.split()}\n",
    "len(id_wiki_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16353"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The encoding is pinned so loading does not depend on the platform's locale default.\n",
    "with open('indon-news.json', encoding='utf-8') as fopen:\n",
    "    news = json.load(fopen)\n",
    "\n",
    "# Split every article's 'text' on '.' and drop the empty fragments.\n",
    "indon_news = [\n",
    "    fragment\n",
    "    for article in news\n",
    "    for fragment in article['text'].split('.')\n",
    "    if fragment\n",
    "]\n",
    "\n",
    "len(indon_news)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1022/1022 [00:00<00:00, 37609.49it/s]\n",
      "100%|██████████| 1/1 [00:00<00:00, 6223.00it/s]28it/s]\n",
      "100%|██████████| 1022/1022 [00:00<00:00, 35556.74it/s]\n",
      "100%|██████████| 1022/1022 [00:00<00:00, 32028.83it/s]\n",
      "100%|██████████| 1022/1022 [00:00<00:00, 28869.54it/s]\n",
      "\n",
      "100%|██████████| 1022/1022 [00:00<00:00, 30735.87it/s]\n",
      "\n",
      "\n",
      "\n",
      "100%|██████████| 1022/1022 [00:00<00:00, 30567.55it/s]\n",
      "\n",
      "\n",
      "100%|██████████| 1022/1022 [00:00<00:00, 21270.92it/s]\n",
      "100%|██████████| 1022/1022 [00:00<00:00, 21297.76it/s]\n",
      "100%|██████████| 1022/1022 [00:00<00:00, 21817.31it/s]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Clean the news fragments with `loop`, then collect their unique tokens.\n",
    "indon_news = cleaning.multiprocessing(indon_news, loop)\n",
    "\n",
    "indon_news_words = {word for fragment in indon_news for word in fragment.split()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extend the 'ind' vocabulary with the id-wiki and news vocabularies.\n",
    "ind_words = ind_words.union(id_wiki_words, indon_news_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://malaya-dataset.s3-ap-southeast-1.amazonaws.com/wikidump1-raw.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1748387"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The encoding is pinned so loading does not depend on the platform's locale default.\n",
    "with open('wikidump1-raw.json', encoding='utf-8') as fopen:\n",
    "    ms_wiki = json.load(fopen)\n",
    "\n",
    "len(ms_wiki)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1893949"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Unique whitespace-separated tokens of the wiki text as loaded.\n",
    "# NOTE(review): this runs before the `cleaning.multiprocessing(ms_wiki, loop)` pass\n",
    "# further down, so the tokens are not cleaned -- confirm that is intended.\n",
    "ms_wiki_words = {word for entry in ms_wiki for word in entry.split()}\n",
    "len(ms_wiki_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extend the Malay word set with the 'zlm' and wiki vocabularies.\n",
    "malays = malays.union(zlm_words, ms_wiki_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A set has no stable iteration order, so sort before dumping to make the file\n",
    "# identical from run to run (assumes every entry is a string).\n",
    "with open('malays_word.json', 'w') as fopen:\n",
    "    json.dump(sorted(malays), fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Words present in both the Malay and the 'ind' vocabularies.\n",
    "intersected = malays & ind_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 109274/109274 [00:02<00:00, 36467.66it/s]\n",
      "100%|██████████| 109274/109274 [00:02<00:00, 37006.81it/s]\n",
      "100%|██████████| 3/3 [00:00<00:00, 7141.27it/s]191.85it/s]\n",
      " 96%|█████████▋| 105447/109274 [00:02<00:00, 35007.38it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 35601.06it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 35483.45it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 32738.12it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 29698.04it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 29837.29it/s]\n",
      " 83%|████████▎ | 90318/109274 [00:02<00:00, 35743.06it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 30110.00it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 28733.46it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 29161.15it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 27687.44it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 34451.81it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 33553.86it/s]\n",
      "100%|██████████| 109274/109274 [00:03<00:00, 34344.70it/s]\n"
     ]
    }
   ],
   "source": [
    "# Apply `loop` (and therefore `preprocessing`) to the wiki entries through the\n",
    "# project-local `cleaning.multiprocessing` helper; the raw entries are replaced by\n",
    "# the result. NOTE(review): the helper presumably fans the list out to worker\n",
    "# processes -- confirm in cleaning.py.\n",
    "ms_wiki = cleaning.multiprocessing(ms_wiki, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1489582"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'ind' vocabulary with the words shared with the Malay vocabulary removed.\n",
    "ind_minus = ind_words.difference(intersected)\n",
    "len(ind_minus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "import malaya\n",
    "\n",
    "# NOTE(review): this reaches into a private attribute of `malaya`; presumably a\n",
    "# collection of English words -- confirm it exists in the installed version.\n",
    "english = malaya.texts._english_words._english_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1748387/1748387 [00:10<00:00, 166086.88it/s]\n"
     ]
    }
   ],
   "source": [
    "# Drop from every cleaned wiki entry the tokens found in `ind_minus` or `english`.\n",
    "for position, entry in enumerate(tqdm(ms_wiki)):\n",
    "    kept = [w for w in entry.split() if not (w in ind_minus or w in english)]\n",
    "    ms_wiki[position] = ' '.join(kept)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 53692/53692 [00:01<00:00, 31482.41it/s]\n"
     ]
    }
   ],
   "source": [
    "# Drop from every 'zlm' sentence the tokens found in `ind_minus` or `english`.\n",
    "for position, sentence in enumerate(tqdm(zlm)):\n",
    "    kept = [w for w in sentence.split() if not (w in ind_minus or w in english)]\n",
    "    zlm[position] = ' '.join(kept)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A set has no stable iteration order, so sort before dumping to make the file\n",
    "# identical from run to run.\n",
    "with open('ind_words.json', 'w') as fopen:\n",
    "    json.dump(sorted(ind_minus), fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final dataset: one list of sentences per class key.\n",
    "with open('formal-language.json', 'w') as fopen:\n",
    "    json.dump(\n",
    "        {\n",
    "            'other': others,\n",
    "            'malay': zlm + ms_wiki,\n",
    "            'ind': ind + id_wiki,\n",
    "            'eng': eng,\n",
    "        },\n",
    "        fopen,\n",
    "    )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
